Main Topics: From 5 Topic Solution
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "topic"
# Per-year summary of `data_text`: the mean of `numwords_r` (missing values
# ignored) and the number of rows observed in each `Year`.
# `.groups = "drop"` states explicitly that the result is ungrouped.
# NOTE(review): the name `summary` masks base::summary() in this session; it is
# kept because later code refers to it.
summary <- data_text %>%
  group_by(Year) %>%
  summarize(
    mean_numwords = mean(numwords_r, na.rm = TRUE),
    N = n(),
    .groups = "drop"
  )
## `summarise()` ungrouping output (override with `.groups` argument)
# Line chart of the yearly mean word count stored in `summary`.
ggplot(data = summary, mapping = aes(x = Year, y = mean_numwords)) +
  geom_line() +
  labs(title = "Number of words over time")
# Line chart of the yearly row count (`N`) stored in `summary`.
ggplot(data = summary, mapping = aes(x = Year, y = N)) +
  geom_line() +
  labs(title = "Number of docs over time")
# Group by decade: future-oriented words.
# Vocabulary of forward-looking terms used to filter the per-decade word counts.
v <- c(
  "may", "might", "future", "will",
  "optimism", "pessimism", "uncertainty", "certainty",
  "outlook", "risk", "risky", "optimistic"
)
# Count occurrences of each word within each decade, sorted by count
# (largest first). The result stays grouped by `decade`.
decade_words <- count(group_by(tbl_tokens, decade), word, sort = TRUE)
# Total number of counted words per decade (sum of the per-word counts `n`).
# `.groups = "drop"` returns an ungrouped table without relying on the default.
decade_totals <- decade_words %>%
  group_by(decade) %>%
  summarize(total = sum(n), .groups = "drop")
## `summarise()` ungrouping output (override with `.groups` argument)
# Attach each decade's total to every word row. The key is named explicitly so
# the join cannot silently pick up other shared columns if the tables change.
decade_words <- left_join(decade_words, decade_totals, by = "decade")
## Joining, by = "decade"
# Share of each decade's words accounted for by every term in `v`,
# drawn as one line chart per word with independent axis scales.
ggplot(
  data = mutate(filter(decade_words, word %in% v), percent = n / total),
  mapping = aes(x = decade, y = percent)
) +
  geom_line() +
  facet_wrap(facets = ~word, scales = "free") +
  labs(title = "Future words in text")
# tf-idf
# Add tf, idf and tf_idf columns, treating each decade as one document.
decade_words_idf <- bind_tf_idf(
  decade_words,
  term = word,
  document = decade,
  n = n
)
# Horizontal bar chart of the 15 highest tf-idf words in each decade,
# one facet per decade.
decade_words_idf %>%
  # Drop any grouping carried over from earlier steps so the sort and the
  # factor levels below are computed over the whole table, not per group.
  ungroup() %>%
  arrange(desc(tf_idf)) %>%
  # Order the factor levels by overall tf-idf so the bars are drawn in rank order.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(decade) %>%
  # Rank explicitly by tf_idf (ties kept) rather than by whichever column
  # happens to be last in the table.
  slice_max(tf_idf, n = 15, with_ties = TRUE) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = as.factor(decade))) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~decade, ncol = 4, scales = "free") +
  coord_flip() +
  ggtitle("TF-IDF By Decade")
## Selecting by tf_idf
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2519242 134.6 4084261 218.2 NA 4084261 218.2
## Vcells 1010610231 7710.4 3497995719 26687.6 102400 4353357712 33213.5
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Package version: 2.0.1
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
#Number words
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "topic"
## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
## Joining, by = "topic"
## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
## Joining, by = "topic"
#Lexicon analysis by topics, MEAN SPLIT SCORES
## Joining, by = "filename"
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2862908 152.9 5300880 283.1 NA 5300880 283.1
## Vcells 1318324636 10058.1 2798396576 21350.1 102400 4353357712 33213.5
## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
# Merge in numbers: attach the columns of `topic_df` to `numbers`.
# The key is named explicitly so the join cannot silently pick up other shared
# columns if either table changes.
numbers_topic <- numbers %>%
  left_join(topic_df, by = "filename")
## Joining, by = "filename"
# Wide to long: stack the columns t0.mean.d through t4.mean.d into a
# `topic` / `value` pair, then keep only the rows whose value equals 1.
# NOTE(review): presumably these columns are 0/1 topic indicators - confirm.
numbers_topics_long <- numbers_topic %>%
  pivot_longer(
    cols = t0.mean.d:t4.mean.d,
    names_to = "topic",
    values_to = "value"
  ) %>%
  filter(value == 1)
# Means: average `percent_numbers` for every Year/topic combination,
# ignoring missing values.
# `.groups = "drop_last"` keeps the result grouped by Year, matching the
# previous default behaviour, but states it explicitly.
number_topics_means <- numbers_topics_long %>%
  group_by(Year, topic) %>%
  summarize(
    mean_per_numbers = mean(percent_numbers, na.rm = TRUE),
    .groups = "drop_last"
  )
## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
# Plot: yearly mean of `percent_numbers`, one scatter panel per topic.
# Axis limits restrict the view to 1930-2005 and 0-0.075; points outside
# those limits are not drawn.
ggplot(
  data = number_topics_means,
  mapping = aes(x = Year, y = mean_per_numbers, group = topic)
) +
  geom_point() +
  xlim(1930, 2005) +
  facet_wrap(facets = ~topic) +
  ylim(0, 0.075) +
  labs(title = "Number words over time")